# Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score, silhouette_samples
from plot_silhouette import plot_silhouette
# Variables
# Variables
file_name = 'lab_exercise.csv'  # unlabeled dataset to cluster (read with pandas below)
separator = ','  # CSV field delimiter passed to read_csv
random_state = 42  # fixed seed so the KMeans runs are reproducible
# Directives
%matplotlib inline
np.random.seed(random_state)  # seed numpy's global RNG for reproducibility
# Read the file
# Load the dataset; there is no header row, so pandas auto-assigns
# integer column names (0, 1, 2, ...).
X = pd.read_csv(file_name, sep=separator, header=None)
# Produce the boxplots
# One boxplot per feature, laid out on a shared figure, to inspect the
# spread of each column and check for outliers.
plt.figure(figsize=(15, 15))
for subplot_idx, col in enumerate(X.columns, start=1):
    plt.subplot(3, 4, subplot_idx)  # 3x4 grid: it looks bad otherwise
    sns.boxplot(X[col])
The boxplots show that there are no outliers. The distributions of columns 0 and 3 are very similar, while columns 1 and 2 have a similar median but a different distribution of values. No other particular situation shows up.
# Pairwise scatter plots of every feature pair, to look for visible cluster structure
sns.pairplot(X)
<seaborn.axisgrid.PairGrid at 0x2588e1ca340>
From the pairplot it is clear that the columns 1 and 2 tend to form quite distinct clusters. They're probably our best bet for our clustering efforts.
In order to find a clustering scheme, we will use K-means with the elbow method, ranging from 2 to 10 clusters
# Candidate numbers of clusters for the elbow / silhouette analysis.
k_range = range(2, 11)
# Collect both inertia (distortion) and silhouette score for each k.
distortions = []
silhouette_scores = []
for n_clusters in k_range:
    model = KMeans(
        n_clusters=n_clusters,
        init='k-means++',
        n_init=10,
        max_iter=300,
        random_state=random_state,
    )
    cluster_labels = model.fit_predict(X)
    distortions.append(model.inertia_)
    silhouette_scores.append(silhouette_score(X, cluster_labels))
# Plot inertia (left y-axis) and silhouette score (right y-axis) against k
# on a shared x-axis, so the elbow and the silhouette peak can be compared.
fig, inertia_ax = plt.subplots()
inertia_color = 'tab:red'
inertia_ax.set_xlabel('Number of clusters')
inertia_ax.set_ylabel('Inertia', color=inertia_color)
inertia_ax.plot(k_range, distortions, color=inertia_color)
inertia_ax.tick_params(axis='y', labelcolor=inertia_color)
# Second y-axis sharing the same x-axis for the silhouette curve.
silhouette_ax = inertia_ax.twinx()
silhouette_color = 'tab:blue'
silhouette_ax.set_ylabel('Silhouette scores', color=silhouette_color)
silhouette_ax.plot(k_range, silhouette_scores, color=silhouette_color)
silhouette_ax.tick_params(axis='y', labelcolor=silhouette_color)
silhouette_ax.set_ylim(0, 1)  # the axis for silhouette is [0,1]
fig.tight_layout()  # otherwise the right y-label is slightly clipped
plt.show()
Both the silhouette scores and the inertia elbow suggest that the best number of clusters is 4, which is in line with what we were expecting given the initial pairplot.
# Best number of clusters according to both the elbow and the silhouette analysis.
best_k = 4
# Create a new KMeans classifier with the best parameter we found.
# n_clusters reuses best_k instead of repeating the literal 4, so the
# chosen k lives in exactly one place.
km = KMeans(n_clusters = best_k,
            init = 'k-means++',
            n_init = 10,
            max_iter = 300,
            random_state = random_state)
# Predict the cluster labels
y_km = km.fit_predict(X)
In order to use the predicted labels as the hue, we will attach them to a new dataframe as an extra column.
# Copy X and append the predicted labels so they can drive the pairplot hue
# (X itself is left unmodified).
X_pred = X.copy()
X_pred['pred_class'] = y_km
sns.pairplot(X_pred, hue = 'pred_class')
<seaborn.axisgrid.PairGrid at 0x2588e9e3250>
In order to perform this task, we will use the plot_silhouette function that was introduced in the exercises in class
# Compute the Silhouette Coefficient for each sample, with the euclidean metric
# Compute the Silhouette Coefficient for each sample, with the euclidean metric
silhouette_score_samples = silhouette_samples(X, y_km, metric='euclidean')
# NOTE(review): assumes plot_silhouette draws on the current figure, so the
# title set here applies to its output — confirm against plot_silhouette.
plt.title(f"Silhouette score for samples with {best_k} clusters")
plot_silhouette(silhouette_score_samples, y_km)
To make this task easier, we leverage numpy's function bincount
# Count how many samples landed in each cluster (index = cluster label)
occurrences = np.bincount(y_km)
bincount created an array whose index is the cluster number and whose value is the number of elements in that cluster. We can then create tuples to make this association explicit.
# Pair each cluster's size with its label, as (size, label) tuples.
item_cluster_tuples = [(size, label) for label, size in enumerate(occurrences)]
item_cluster_tuples
[(375, 0), (376, 1), (359, 2), (390, 3)]
We can now sort the tuples and extract the cluster index to obtain what was requested
sorted_clusters = [i[1] for i in sorted(item_cluster_tuples, reverse = True)]
sorted_clusters
[3, 1, 0, 2]